{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8b9a82c-ad03-4faf-81dc-ddfa47bc8acb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import re\n",
    "import statsmodels.api as sm\n",
    "import statsmodels.formula.api as smf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ad9876a-74fe-415c-bd39-9de1de55249a",
   "metadata": {},
   "source": [
    "# Analysis 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "026f7f84-96a5-4d6b-a263-8ae9a47710cd",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "989ae2cd-47a5-4bf1-a46e-52deee31c093",
   "metadata": {},
   "outputs": [],
   "source": [
    "# =========================================================\n",
    "# Paths (edit if needed)\n",
    "# =========================================================\n",
    "BASE = Path(r\"E:\\불평등 연구\\데이터\\59_출퇴근불평등\")\n",
    "CANDIDATES = [\n",
    "    BASE / \"df_H1.xlsx\"                    # fallback\n",
    "]\n",
    "\n",
    "# =========================================================\n",
    "# Helpers\n",
    "# =========================================================\n",
    "def find_input_path():\n",
    "    for p in CANDIDATES:\n",
    "        if p.exists():\n",
    "            return p\n",
    "    raise FileNotFoundError(\"No input file found among:\\n\" + \"\\n\".join(map(str, CANDIDATES)))\n",
    "\n",
    "def read_route_panel(path: Path) -> pd.DataFrame:\n",
    "    ext = path.suffix.lower()\n",
    "    if ext == \".parquet\":\n",
    "        df = pd.read_parquet(path)\n",
    "    elif ext == \".csv\":\n",
    "        df = pd.read_csv(path, encoding=\"utf-8-sig\")\n",
    "    elif ext in (\".xlsx\", \".xls\"):\n",
    "        try:\n",
    "            df = pd.read_excel(path, sheet_name=\"ROUTE_H1\")\n",
    "        except Exception:\n",
    "            df = pd.read_excel(path, sheet_name=0)\n",
    "    else:\n",
    "        raise ValueError(f\"Unsupported input extension: {ext}\")\n",
    "\n",
    "    df.columns = [str(c).strip() for c in df.columns]\n",
    "    if \"ym\" not in df.columns:\n",
    "        raise KeyError(\"Input must have 'ym' (YYYYMM).\")\n",
    "\n",
    "    # normalize keys\n",
    "    df[\"ym\"] = df[\"ym\"].astype(str).str.replace(r\"\\D\", \"\", regex=True).str.slice(0, 6)\n",
    "    for col in [\"출발시군구코드\", \"도착시군구코드\"]:\n",
    "        if col not in df.columns:\n",
    "            raise KeyError(f\"Missing column: {col}\")\n",
    "        df[col] = df[col].astype(str).str.strip()\n",
    "\n",
    "    # OD fixed-effect id\n",
    "    df[\"OD\"] = df[\"출발시군구코드\"] + \"_\" + df[\"도착시군구코드\"]\n",
    "    return df\n",
    "\n",
    "def as_term(colname: str) -> str:\n",
    "    \"\"\"\n",
    "    Return a formula-safe term. If the name contains any non [A-Za-z0-9_],\n",
    "    wrap with Q(\"...\") so patsy treats it as a literal column name.\n",
    "    \"\"\"\n",
    "    return f'Q(\"{colname}\")' if re.search(r\"[^0-9A-Za-z_]\", colname) else colname\n",
    "\n",
    "def tidy_and_filter_for_H1(df: pd.DataFrame):\n",
    "    need = {\n",
    "        \"Delta_log_price\": [\"Delta_log_price\"],\n",
    "        \"HW_time\": [\"HW_가중평균시간\"],\n",
    "        \"TOT_time\": [\"TOTAL_통근시간(분)\"],\n",
    "        \"HW_weight\": [\"HW_총이동인구\"],\n",
    "        \"TOT_weight\": [\"TOTAL_플로우_보조지표(최소값)\"],\n",
    "    }\n",
    "    cols = {}\n",
    "    for k, cands in need.items():\n",
    "        for c in cands:\n",
    "            if c in df.columns:\n",
    "                cols[k] = c\n",
    "                break\n",
    "        if k not in cols:\n",
    "            raise KeyError(f\"Cannot find column for {k}. Looked for {cands}\")\n",
    "\n",
    "    keep = [\"ym\", \"출발시군구코드\", \"도착시군구코드\", \"OD\",\n",
    "            cols[\"Delta_log_price\"], cols[\"HW_time\"], cols[\"HW_weight\"],\n",
    "            cols[\"TOT_time\"], cols[\"TOT_weight\"]]\n",
    "    df = df[keep].copy()\n",
    "\n",
    "    for c in [cols[\"Delta_log_price\"], cols[\"HW_time\"], cols[\"HW_weight\"],\n",
    "              cols[\"TOT_time\"], cols[\"TOT_weight\"]]:\n",
    "        df[c] = pd.to_numeric(df[c], errors=\"coerce\")\n",
    "\n",
    "    hw_df = df.dropna(subset=[cols[\"HW_time\"], cols[\"Delta_log_price\"], cols[\"HW_weight\"]]).copy()\n",
    "    hw_df = hw_df.loc[hw_df[cols[\"HW_weight\"]] > 0].copy()\n",
    "\n",
    "    tot_df = df.dropna(subset=[cols[\"TOT_time\"], cols[\"Delta_log_price\"], cols[\"TOT_weight\"]]).copy()\n",
    "    tot_df = tot_df.loc[tot_df[cols[\"TOT_weight\"]] > 0].copy()\n",
    "\n",
    "    return hw_df, tot_df, cols\n",
    "\n",
    "def wls_fe_cluster(formula, data, weight_col, cluster_series):\n",
    "    w = data[weight_col].astype(float).values\n",
    "    model = smf.wls(formula=formula, data=data, weights=w)\n",
    "    res = model.fit(cov_type=\"cluster\", cov_kwds={\"groups\": cluster_series})\n",
    "    return res\n",
    "\n",
    "def print_core_result(res, coef_name, label):\n",
    "    if coef_name not in res.params.index:\n",
    "        print(f\"[WARN] coefficient '{coef_name}' not present. Full summary:\")\n",
    "        print(res.summary())\n",
    "        return\n",
    "    b  = res.params[coef_name]\n",
    "    se = res.bse[coef_name]\n",
    "    t  = res.tvalues[coef_name]\n",
    "    p  = res.pvalues[coef_name]\n",
    "    ci_low, ci_high = res.conf_int().loc[coef_name].tolist()\n",
    "    print(f\"\\n=== {label} | Key coefficient ===\")\n",
    "    print(f\"{coef_name:>20}: {b: .4f} (SE {se:.4f}, t={t:.2f}, p={p:.4g})\")\n",
    "    print(f\"95% CI: [{ci_low:.4f}, {ci_high:.4f}]\")\n",
    "    print(f\"Interpretation: 10% increase in price ratio ⇒ {0.10*b:.3f} minutes\")\n",
    "\n",
    "# =========================================================\n",
    "# 1) Load and prep\n",
    "# =========================================================\n",
    "in_path = find_input_path()\n",
    "print(f\"[INFO] Using input: {in_path}\")\n",
    "route = read_route_panel(in_path)\n",
    "\n",
    "hw_df, tot_df, cols = tidy_and_filter_for_H1(route)\n",
    "print(f\"[N] H1-HW rows:  {len(hw_df):,}\")\n",
    "print(f\"[N] H1-TOT rows: {len(tot_df):,}\")\n",
    "\n",
    "# =========================================================\n",
    "# 2) H1 — Morning HW minutes\n",
    "#     T^{HW}_{odt} = β Δℓ_{odt} + λ_{od} + τ_t + ε_{odt}\n",
    "#     weights = N^{HW}_{odt}, SEs clustered at destination d\n",
    "# =========================================================\n",
    "lhs_hw = as_term(cols[\"HW_time\"])\n",
    "rhs    = as_term(cols[\"Delta_log_price\"])\n",
    "formula_hw = f\"{lhs_hw} ~ {rhs} + C(OD) + C(ym)\"\n",
    "\n",
    "res_hw = wls_fe_cluster(\n",
    "    formula=formula_hw,\n",
    "    data=hw_df,\n",
    "    weight_col=cols[\"HW_weight\"],\n",
    "    cluster_series=hw_df[\"도착시군구코드\"]\n",
    ")\n",
    "print(res_hw.summary())\n",
    "print_core_result(res_hw, coef_name=cols[\"Delta_log_price\"], label=\"H1-HW\")\n",
    "\n",
    "# =========================================================\n",
    "# 3) Robustness — Daily round-trip minutes\n",
    "#     T^{TOT}_{odt} = β_tot Δℓ_{odt} + λ_{od} + τ_t + υ_{odt}\n",
    "#     weights = flow_min\n",
    "# =========================================================\n",
    "lhs_tot = as_term(cols[\"TOT_time\"])         # <-- safely quoted: TOTAL_통근시간(분)\n",
    "formula_tot = f\"{lhs_tot} ~ {rhs} + C(OD) + C(ym)\"\n",
    "\n",
    "res_tot = wls_fe_cluster(\n",
    "    formula=formula_tot,\n",
    "    data=tot_df,\n",
    "    weight_col=cols[\"TOT_weight\"],\n",
    "    cluster_series=tot_df[\"도착시군구코드\"]\n",
    ")\n",
    "print(res_tot.summary())\n",
    "print_core_result(res_tot, coef_name=cols[\"Delta_log_price\"], label=\"H1-TOT (robustness)\")\n",
    "\n",
    "# =========================================================\n",
    "# 4) Export concise results\n",
    "# =========================================================\n",
    "def export_coef_table(res, coef_name, out_csv):\n",
    "    tab = pd.DataFrame({\n",
    "        \"param\": res.params.index,\n",
    "        \"coef\":  res.params.values,\n",
    "        \"se\":    res.bse.values,\n",
    "        \"t\":     res.tvalues.values,\n",
    "        \"p\":     res.pvalues.values\n",
    "    })\n",
    "    tab.loc[tab[\"param\"] == coef_name].to_csv(out_csv, index=False, encoding=\"utf-8-sig\")\n",
    "    print(f\"[OK] wrote {out_csv}\")\n",
    "\n",
    "export_coef_table(res_hw,  cols[\"Delta_log_price\"], BASE / \"H1_HW_keycoef.csv\")\n",
    "export_coef_table(res_tot, cols[\"Delta_log_price\"], BASE / \"H1_TOT_keycoef.csv\")\n",
    "\n",
    "with open(BASE / \"H1_HW_full_summary.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(str(res_hw.summary()))\n",
    "with open(BASE / \"H1_TOT_full_summary.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(str(res_tot.summary()))\n",
    "print(\"[OK] wrote regression summaries.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "100cedf6-393a-48c2-8f1a-7d44524c2e26",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ede73b4d-8f92-40a9-820b-b5a2ca48c4b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "BASE = Path(r\"E:\\불평등 연구\\데이터\\59_출퇴근불평등\")\n",
    "CANDIDATES = [\n",
    "    BASE / \"df_H1.xlsx\",                 # your input panel (Excel)\n",
    "    BASE / \"ROUTE_PANEL_H1_2020_2025.csv\",\n",
    "    BASE / \"ROUTE_PANEL_H1_2020_2025.parquet\",\n",
    "]\n",
    "\n",
    "# Output files\n",
    "OUT_HW_KEY   = BASE / \"H1_HW_keycoef.csv\"\n",
    "OUT_TOT_KEY  = BASE / \"H1_TOT_keycoef.csv\"\n",
    "OUT_WH_KEY   = BASE / \"H1_WH_keycoef.csv\"\n",
    "\n",
    "OUT_HW_FULL  = BASE / \"H1_HW_full_summary.txt\"\n",
    "OUT_TOT_FULL = BASE / \"H1_TOT_full_summary.txt\"\n",
    "OUT_WH_FULL  = BASE / \"H1_WH_full_summary.txt\"\n",
    "\n",
    "# =========================================================\n",
    "# Helpers\n",
    "# =========================================================\n",
    "def find_input_path():\n",
    "    for p in CANDIDATES:\n",
    "        if p.exists():\n",
    "            return p\n",
    "    raise FileNotFoundError(\"No input file found among:\\n\" + \"\\n\".join(map(str, CANDIDATES)))\n",
    "\n",
    "def read_route_panel(path: Path) -> pd.DataFrame:\n",
    "    ext = path.suffix.lower()\n",
    "    if ext == \".parquet\":\n",
    "        df = pd.read_parquet(path)\n",
    "    elif ext == \".csv\":\n",
    "        df = pd.read_csv(path, encoding=\"utf-8-sig\")\n",
    "    elif ext in (\".xlsx\", \".xls\"):\n",
    "        try:\n",
    "            df = pd.read_excel(path, sheet_name=\"ROUTE_H1\")\n",
    "        except Exception:\n",
    "            df = pd.read_excel(path, sheet_name=0)\n",
    "    else:\n",
    "        raise ValueError(f\"Unsupported input extension: {ext}\")\n",
    "\n",
    "    df.columns = [str(c).strip() for c in df.columns]\n",
    "    if \"ym\" not in df.columns:\n",
    "        raise KeyError(\"Input must have 'ym' (YYYYMM).\")\n",
    "\n",
    "    # normalize keys\n",
    "    df[\"ym\"] = df[\"ym\"].astype(str).str.replace(r\"\\D\", \"\", regex=True).str.slice(0, 6)\n",
    "    for col in [\"출발시군구코드\", \"도착시군구코드\"]:\n",
    "        if col not in df.columns:\n",
    "            raise KeyError(f\"Missing column: {col}\")\n",
    "        df[col] = df[col].astype(str).str.strip()\n",
    "\n",
    "    # OD fixed-effect id\n",
    "    df[\"OD\"] = df[\"출발시군구코드\"] + \"_\" + df[\"도착시군구코드\"]\n",
    "    return df\n",
    "\n",
    "def as_term(colname: str) -> str:\n",
    "    \"\"\"Quote non-ascii/space/paren names for patsy formulas.\"\"\"\n",
    "    return f'Q(\"{colname}\")' if re.search(r\"[^0-9A-Za-z_]\", colname) else colname\n",
    "\n",
    "def tidy_and_filter_for_H1(df: pd.DataFrame):\n",
    "    need = {\n",
    "        \"Delta_log_price\": [\"Delta_log_price\"],\n",
    "\n",
    "        \"HW_time\":   [\"HW_가중평균시간\"],\n",
    "        \"HW_weight\": [\"HW_총이동인구\"],\n",
    "\n",
    "        \"WH_time\":   [\"WH_가중평균시간\"],\n",
    "        \"WH_weight\": [\"WH_총이동인구\"],\n",
    "\n",
    "        \"TOT_time\":   [\"TOTAL_통근시간(분)\"],\n",
    "        \"TOT_weight\": [\"TOTAL_플로우_보조지표(최소값)\"],\n",
    "    }\n",
    "    cols = {}\n",
    "    for k, cands in need.items():\n",
    "        for c in cands:\n",
    "            if c in df.columns:\n",
    "                cols[k] = c\n",
    "                break\n",
    "        if k not in cols:\n",
    "            raise KeyError(f\"Cannot find column for {k}. Looked for {cands}\")\n",
    "\n",
    "    keep = [\n",
    "        \"ym\", \"출발시군구코드\", \"도착시군구코드\", \"OD\",\n",
    "        cols[\"Delta_log_price\"],\n",
    "        cols[\"HW_time\"], cols[\"HW_weight\"],\n",
    "        cols[\"WH_time\"], cols[\"WH_weight\"],\n",
    "        cols[\"TOT_time\"], cols[\"TOT_weight\"],\n",
    "    ]\n",
    "    df = df[keep].copy()\n",
    "\n",
    "    # coerce numerics\n",
    "    for c in [\n",
    "        cols[\"Delta_log_price\"],\n",
    "        cols[\"HW_time\"], cols[\"HW_weight\"],\n",
    "        cols[\"WH_time\"], cols[\"WH_weight\"],\n",
    "        cols[\"TOT_time\"], cols[\"TOT_weight\"],\n",
    "    ]:\n",
    "        df[c] = pd.to_numeric(df[c], errors=\"coerce\")\n",
    "\n",
    "    # analysis samples\n",
    "    hw_df = df.dropna(subset=[cols[\"HW_time\"], cols[\"Delta_log_price\"], cols[\"HW_weight\"]]).copy()\n",
    "    hw_df = hw_df.loc[hw_df[cols[\"HW_weight\"]] > 0].copy()\n",
    "\n",
    "    wh_df = df.dropna(subset=[cols[\"WH_time\"], cols[\"Delta_log_price\"], cols[\"WH_weight\"]]).copy()\n",
    "    wh_df = wh_df.loc[wh_df[cols[\"WH_weight\"]] > 0].copy()\n",
    "\n",
    "    tot_df = df.dropna(subset=[cols[\"TOT_time\"], cols[\"Delta_log_price\"], cols[\"TOT_weight\"]]).copy()\n",
    "    tot_df = tot_df.loc[tot_df[cols[\"TOT_weight\"]] > 0].copy()\n",
    "\n",
    "    return hw_df, wh_df, tot_df, cols\n",
    "\n",
    "def wls_fe_cluster(formula, data, weight_col, cluster_series):\n",
    "    w = data[weight_col].astype(float).values\n",
    "    model = smf.wls(formula=formula, data=data, weights=w)\n",
    "    res = model.fit(cov_type=\"cluster\", cov_kwds={\"groups\": cluster_series})\n",
    "    return res\n",
    "\n",
    "def print_core_result(res, coef_name, label):\n",
    "    if coef_name not in res.params.index:\n",
    "        print(f\"[WARN] coefficient '{coef_name}' not present. Full summary:\")\n",
    "        print(res.summary())\n",
    "        return\n",
    "    b  = res.params[coef_name]\n",
    "    se = res.bse[coef_name]\n",
    "    t  = res.tvalues[coef_name]\n",
    "    p  = res.pvalues[coef_name]\n",
    "    ci_low, ci_high = res.conf_int().loc[coef_name].tolist()\n",
    "    print(f\"\\n=== {label} | Key coefficient ===\")\n",
    "    print(f\"{coef_name:>20}: {b: .4f} (SE {se:.4f}, t={t:.2f}, p={p:.4g})\")\n",
    "    print(f\"95% CI: [{ci_low:.4f}, {ci_high:.4f}]\")\n",
    "    print(f\"Interpretation: 10% increase in price ratio ⇒ {0.10*b:.3f} minutes\")\n",
    "\n",
    "def export_coef_table(res, coef_name, out_csv):\n",
    "    tab = pd.DataFrame({\n",
    "        \"param\": res.params.index,\n",
    "        \"coef\":  res.params.values,\n",
    "        \"se\":    res.bse.values,\n",
    "        \"t\":     res.tvalues.values,\n",
    "        \"p\":     res.pvalues.values\n",
    "    })\n",
    "    tab.loc[tab[\"param\"] == coef_name].to_csv(out_csv, index=False, encoding=\"utf-8-sig\")\n",
    "    print(f\"[OK] wrote {out_csv}\")\n",
    "\n",
    "# =========================================================\n",
    "# 1) Load and prep\n",
    "# =========================================================\n",
    "in_path = find_input_path()\n",
    "print(f\"[INFO] Using input: {in_path}\")\n",
    "route = read_route_panel(in_path)\n",
    "\n",
    "hw_df, wh_df, tot_df, cols = tidy_and_filter_for_H1(route)\n",
    "print(f\"[N] H1-HW rows:  {len(hw_df):,}\")\n",
    "print(f\"[N] H1-WH rows:  {len(wh_df):,}\")\n",
    "print(f\"[N] H1-TOT rows: {len(tot_df):,}\")\n",
    "\n",
    "# =========================================================\n",
    "# 2) H1 — Morning HW minutes\n",
    "#     T^{HW}_{odt} = β Δℓ_{odt} + λ_{od} + τ_t + ε_{odt}\n",
    "#     weights = N^{HW}_{odt}, SEs clustered at destination d\n",
    "# =========================================================\n",
    "lhs_hw = as_term(cols[\"HW_time\"])\n",
    "rhs    = as_term(cols[\"Delta_log_price\"])\n",
    "formula_hw = f\"{lhs_hw} ~ {rhs} + C(OD) + C(ym)\"\n",
    "\n",
    "res_hw = wls_fe_cluster(\n",
    "    formula=formula_hw,\n",
    "    data=hw_df,\n",
    "    weight_col=cols[\"HW_weight\"],\n",
    "    cluster_series=hw_df[\"도착시군구코드\"]\n",
    ")\n",
    "print(\"\\n\" + \"=\"*86)\n",
    "print(\"H1 — HW minutes on Δlog price (weights = HW flow, FE: OD & month)\")\n",
    "print(\"=\"*86)\n",
    "print(res_hw.summary())\n",
    "print_core_result(res_hw, coef_name=cols[\"Delta_log_price\"], label=\"H1-HW\")\n",
    "\n",
    "# =========================================================\n",
    "# 3) H1 — Evening WH minutes (added)\n",
    "#     T^{WH}_{odt} = β_WH Δℓ_{odt} + λ_{od} + τ_t + ε_{odt}^{WH}\n",
    "#     weights = N^{WH}_{odt}, SEs clustered at destination d (work district)\n",
    "# =========================================================\n",
    "lhs_wh = as_term(cols[\"WH_time\"])\n",
    "formula_wh = f\"{lhs_wh} ~ {rhs} + C(OD) + C(ym)\"\n",
    "\n",
    "res_wh = wls_fe_cluster(\n",
    "    formula=formula_wh,\n",
    "    data=wh_df,\n",
    "    weight_col=cols[\"WH_weight\"],\n",
    "    cluster_series=wh_df[\"도착시군구코드\"]\n",
    ")\n",
    "print(\"\\n\" + \"=\"*86)\n",
    "print(\"H1 — WH minutes on Δlog price (weights = WH flow, FE: OD & month)\")\n",
    "print(\"=\"*86)\n",
    "print(res_wh.summary())\n",
    "print_core_result(res_wh, coef_name=cols[\"Delta_log_price\"], label=\"H1-WH\")\n",
    "\n",
    "# =========================================================\n",
    "# 4) Robustness — Daily round-trip minutes\n",
    "#     T^{TOT}_{odt} = β_tot Δℓ_{odt} + λ_{od} + τ_t + υ_{odt}\n",
    "#     weights = flow_min (TOTAL_플로우_보조지표(최소값))\n",
    "# =========================================================\n",
    "lhs_tot = as_term(cols[\"TOT_time\"])         # safely quoted (has parentheses)\n",
    "formula_tot = f\"{lhs_tot} ~ {rhs} + C(OD) + C(ym)\"\n",
    "\n",
    "res_tot = wls_fe_cluster(\n",
    "    formula=formula_tot,\n",
    "    data=tot_df,\n",
    "    weight_col=cols[\"TOT_weight\"],\n",
    "    cluster_series=tot_df[\"도착시군구코드\"]\n",
    ")\n",
    "print(\"\\n\" + \"=\"*86)\n",
    "print(\"H1 — TOTAL minutes on Δlog price (weights = min(HW,WH) flow, FE: OD & month)\")\n",
    "print(\"=\"*86)\n",
    "print(res_tot.summary())\n",
    "print_core_result(res_tot, coef_name=cols[\"Delta_log_price\"], label=\"H1-TOT (robustness)\")\n",
    "\n",
    "# =========================================================\n",
    "# 5) Export concise results + full summaries\n",
    "# =========================================================\n",
    "export_coef_table(res_hw,  cols[\"Delta_log_price\"], OUT_HW_KEY)\n",
    "export_coef_table(res_wh,  cols[\"Delta_log_price\"], OUT_WH_KEY)\n",
    "export_coef_table(res_tot, cols[\"Delta_log_price\"], OUT_TOT_KEY)\n",
    "\n",
    "with open(OUT_HW_FULL, \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(str(res_hw.summary()))\n",
    "with open(OUT_WH_FULL, \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(str(res_wh.summary()))\n",
    "with open(OUT_TOT_FULL, \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(str(res_tot.summary()))\n",
    "print(\"[OK] wrote regression summaries and key-coefficient CSVs.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "816fca3c-90c7-48fb-8891-c6bfe25870f5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "c5c74d76-6e4c-4cf4-acdb-2c50341ed31c",
   "metadata": {},
   "source": [
    "## Analysis 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5e0262c-7a66-4a48-b47a-ed9140f68eff",
   "metadata": {},
   "outputs": [],
   "source": [
    "BASE = Path(r\"E:\\불평등 연구\\데이터\\59_출퇴근불평등\")\n",
    "CANDIDATES = [\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025_DESTinSeoul.parquet\",\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025_DESTinSeoul.csv\",\n",
    "    BASE / \"df_H2.xlsx\"                    # optional fallback\n",
    "]\n",
    "\n",
    "# =========================\n",
    "# Helpers\n",
    "# =========================\n",
    "def find_input_path():\n",
    "    for p in CANDIDATES:\n",
    "        if p.exists():\n",
    "            return p\n",
    "    raise FileNotFoundError(\"No input file found among:\\n\" + \"\\n\".join(map(str, CANDIDATES)))\n",
    "\n",
    "def read_dest_panel(path: Path) -> pd.DataFrame:\n",
    "    ext = path.suffix.lower()\n",
    "    if ext == \".parquet\":\n",
    "        df = pd.read_parquet(path)\n",
    "    elif ext == \".csv\":\n",
    "        df = pd.read_csv(path, encoding=\"utf-8-sig\")\n",
    "    elif ext in (\".xlsx\", \".xls\"):\n",
    "        # Try the expected sheet name first\n",
    "        try:\n",
    "            df = pd.read_excel(path, sheet_name=\"DEST_H2H3\")\n",
    "        except Exception:\n",
    "            df = pd.read_excel(path, sheet_name=0)\n",
    "    else:\n",
    "        raise ValueError(f\"Unsupported input extension: {ext}\")\n",
    "\n",
    "    df.columns = [str(c).strip() for c in df.columns]\n",
    "\n",
    "    # Essential keys\n",
    "    if \"ym\" not in df.columns:\n",
    "        raise KeyError(\"Panel must contain 'ym' (YYYYMM).\")\n",
    "    if \"도착시군구코드\" not in df.columns:\n",
    "        raise KeyError(\"Panel must contain '도착시군구코드' (destination district code).\")\n",
    "    if \"GapPrice\" not in df.columns:\n",
    "        raise KeyError(\"Panel must contain 'GapPrice' (ln P_d - inbound-weighted ln P_o).\")\n",
    "    # At least one inequality outcome\n",
    "    outcands = [\"GAP_inbound_vs_within_TOTAL\", \"GAP_inbound_vs_within_HW\", \"GAP_inbound_vs_within_WH\"]\n",
    "    if not any(c in df.columns for c in outcands):\n",
    "        raise KeyError(f\"Panel must contain one of {outcands} as outcome(s).\")\n",
    "\n",
    "    # Clean keys\n",
    "    df[\"month\"] = df[\"ym\"].astype(str).str.replace(r\"\\D\", \"\", regex=True).str.slice(0, 6)\n",
    "    df[\"dest\"] = df[\"도착시군구코드\"].astype(str).str.strip()\n",
    "\n",
    "    # Ensure numeric\n",
    "    for c in [\"GapPrice\", \"HW_flow_total\", \"HW_flow_inbound\"] + outcands:\n",
    "        if c in df.columns:\n",
    "            df[c] = pd.to_numeric(df[c], errors=\"coerce\")\n",
    "\n",
    "    return df\n",
    "\n",
    "def model_wls_fe_cluster(data, y, x=\"GapPrice\", fe_dest=\"dest\", fe_month=\"month\",\n",
    "                         weight_col=None, cluster_col=\"dest\"):\n",
    "    \"\"\"\n",
    "    Weighted least squares with district and month fixed effects, SEs clustered at destination.\n",
    "    \"\"\"\n",
    "    # Build a patsy-safe LHS/RHS\n",
    "    def as_term(colname: str) -> str:\n",
    "        return f'Q(\"{colname}\")' if re.search(r\"[^0-9A-Za-z_]\", colname) else colname\n",
    "\n",
    "    lhs = as_term(y)\n",
    "    rhs = as_term(x)\n",
    "    # Use ascii aliases for FE: C(dest) and C(month)\n",
    "    formula = f\"{lhs} ~ {rhs} + C({fe_dest}) + C({fe_month})\"\n",
    "\n",
    "    if weight_col is None:\n",
    "        model = smf.ols(formula=formula, data=data)\n",
    "    else:\n",
    "        w = data[weight_col].astype(float).values\n",
    "        model = smf.wls(formula=formula, data=data, weights=w)\n",
    "\n",
    "    res = model.fit(cov_type=\"cluster\", cov_kwds={\"groups\": data[cluster_col]})\n",
    "    return res\n",
    "\n",
    "def print_keycoef(res, coef=\"GapPrice\", label=\"H2\"):\n",
    "    if coef not in res.params.index:\n",
    "        print(f\"[WARN] '{coef}' not in params; full summary below.\")\n",
    "        print(res.summary())\n",
    "        return\n",
    "    b  = res.params[coef]\n",
    "    se = res.bse[coef]\n",
    "    t  = res.tvalues[coef]\n",
    "    p  = res.pvalues[coef]\n",
    "    lo, hi = res.conf_int().loc[coef].tolist()\n",
    "    print(f\"\\n=== {label} | Key coefficient on {coef} ===\")\n",
    "    print(f\"{coef:>12}: {b: .4f} (SE {se:.4f}, t={t:.2f}, p={p:.4g})\")\n",
    "    print(f\"95% CI: [{lo:.4f}, {hi:.4f}]\")\n",
    "    print(f\"Interpretation: +0.10 in GapPrice (≈10% higher work vs. origins) ⇒ {0.10*b:.3f} minutes in the gap\")\n",
    "\n",
    "def export_keycoef(res, coef, path: Path):\n",
    "    tab = pd.DataFrame({\n",
    "        \"param\": res.params.index,\n",
    "        \"coef\":  res.params.values,\n",
    "        \"se\":    res.bse.values,\n",
    "        \"t\":     res.tvalues.values,\n",
    "        \"p\":     res.pvalues.values\n",
    "    })\n",
    "    tab.loc[tab[\"param\"] == coef].to_csv(path, index=False, encoding=\"utf-8-sig\")\n",
    "    print(f\"[OK] wrote {path}\")\n",
    "\n",
    "# =========================\n",
    "# 1) Load data\n",
    "# =========================\n",
    "in_path = find_input_path()\n",
    "print(f\"[INFO] Using input: {in_path}\")\n",
    "dest = read_dest_panel(in_path)\n",
    "\n",
    "# Outcomes present?\n",
    "has_total = \"GAP_inbound_vs_within_TOTAL\" in dest.columns\n",
    "has_hw    = \"GAP_inbound_vs_within_HW\"    in dest.columns\n",
    "has_wh    = \"GAP_inbound_vs_within_WH\"    in dest.columns\n",
    "\n",
    "# =========================\n",
    "# 2) Main sample: drop missing essentials\n",
    "# =========================\n",
    "def prep_sample(df: pd.DataFrame, ycol: str, need_weight: bool):\n",
    "    cols_need = [\"GapPrice\", \"dest\", \"month\", ycol]\n",
    "    if need_weight:\n",
    "        # preferred weight is inbound flow; if missing or zero, drop\n",
    "        weight_col = \"HW_flow_inbound\" if \"HW_flow_inbound\" in df.columns else None\n",
    "        if weight_col is None:\n",
    "            print(\"[INFO] Inbound flow weight not found; proceeding unweighted.\")\n",
    "        else:\n",
    "            cols_need.append(weight_col)\n",
    "    else:\n",
    "        weight_col = None\n",
    "\n",
    "    dd = df[cols_need].copy()\n",
    "    dd = dd.dropna(subset=[\"GapPrice\", ycol])\n",
    "    if weight_col is not None:\n",
    "        dd = dd.dropna(subset=[weight_col])\n",
    "        dd = dd.loc[dd[weight_col] > 0]\n",
    "\n",
    "    return dd, weight_col\n",
    "\n",
    "# =========================\n",
    "# 3) H2 — TOTAL gap on GapPrice\n",
    "# =========================\n",
    "if has_total:\n",
    "    # Weighted by inbound HW flow (preferred)\n",
    "    d_w, wcol = prep_sample(dest, \"GAP_inbound_vs_within_TOTAL\", need_weight=True)\n",
    "    if wcol is not None:\n",
    "        res_tot_w = model_wls_fe_cluster(d_w, y=\"GAP_inbound_vs_within_TOTAL\", x=\"GapPrice\",\n",
    "                                         weight_col=wcol, cluster_col=\"dest\")\n",
    "        print(res_tot_w.summary())\n",
    "        print_keycoef(res_tot_w, coef=\"GapPrice\", label=\"H2-TOTAL (weighted)\")\n",
    "        export_keycoef(res_tot_w, \"GapPrice\", BASE / \"H2_TOTAL_keycoef_weighted.csv\")\n",
    "        with open(BASE / \"H2_TOTAL_full_summary_weighted.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(str(res_tot_w.summary()))\n",
    "\n",
    "    # Unweighted (robustness)\n",
    "    d_u, _ = prep_sample(dest, \"GAP_inbound_vs_within_TOTAL\", need_weight=False)\n",
    "    res_tot_u = model_wls_fe_cluster(d_u, y=\"GAP_inbound_vs_within_TOTAL\", x=\"GapPrice\",\n",
    "                                     weight_col=None, cluster_col=\"dest\")\n",
    "    print(res_tot_u.summary())\n",
    "    print_keycoef(res_tot_u, coef=\"GapPrice\", label=\"H2-TOTAL (unweighted)\")\n",
    "    export_keycoef(res_tot_u, \"GapPrice\", BASE / \"H2_TOTAL_keycoef_unweighted.csv\")\n",
    "    with open(BASE / \"H2_TOTAL_full_summary_unweighted.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "        f.write(str(res_tot_u.summary()))\n",
    "else:\n",
    "    print(\"[WARN] 'GAP_inbound_vs_within_TOTAL' not found; skipping TOTAL model.\")\n",
    "\n",
    "# =========================\n",
    "# 4) Robustness — HW and WH gaps separately\n",
    "# =========================\n",
    "if has_hw:\n",
    "    d_hw_w, wcol = prep_sample(dest, \"GAP_inbound_vs_within_HW\", need_weight=True)\n",
    "    if wcol is not None and len(d_hw_w):\n",
    "        res_hw_w = model_wls_fe_cluster(d_hw_w, y=\"GAP_inbound_vs_within_HW\", x=\"GapPrice\",\n",
    "                                        weight_col=wcol, cluster_col=\"dest\")\n",
    "        print(res_hw_w.summary())\n",
    "        print_keycoef(res_hw_w, label=\"H2-HW (weighted)\")\n",
    "        export_keycoef(res_hw_w, \"GapPrice\", BASE / \"H2_HW_keycoef_weighted.csv\")\n",
    "        with open(BASE / \"H2_HW_full_summary_weighted.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(str(res_hw_w.summary()))\n",
    "\n",
    "    d_hw_u, _ = prep_sample(dest, \"GAP_inbound_vs_within_HW\", need_weight=False)\n",
    "    if len(d_hw_u):\n",
    "        res_hw_u = model_wls_fe_cluster(d_hw_u, y=\"GAP_inbound_vs_within_HW\", x=\"GapPrice\",\n",
    "                                        weight_col=None, cluster_col=\"dest\")\n",
    "        print(res_hw_u.summary())\n",
    "        print_keycoef(res_hw_u, label=\"H2-HW (unweighted)\")\n",
    "        export_keycoef(res_hw_u, \"GapPrice\", BASE / \"H2_HW_keycoef_unweighted.csv\")\n",
    "        with open(BASE / \"H2_HW_full_summary_unweighted.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(str(res_hw_u.summary()))\n",
    "\n",
    "if has_wh:\n",
    "    d_wh_w, wcol = prep_sample(dest, \"GAP_inbound_vs_within_WH\", need_weight=True)\n",
    "    if wcol is not None and len(d_wh_w):\n",
    "        res_wh_w = model_wls_fe_cluster(d_wh_w, y=\"GAP_inbound_vs_within_WH\", x=\"GapPrice\",\n",
    "                                        weight_col=wcol, cluster_col=\"dest\")\n",
    "        print(res_wh_w.summary())\n",
    "        print_keycoef(res_wh_w, label=\"H2-WH (weighted)\")\n",
    "        export_keycoef(res_wh_w, \"GapPrice\", BASE / \"H2_WH_keycoef_weighted.csv\")\n",
    "        with open(BASE / \"H2_WH_full_summary_weighted.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(str(res_wh_w.summary()))\n",
    "\n",
    "    d_wh_u, _ = prep_sample(dest, \"GAP_inbound_vs_within_WH\", need_weight=False)\n",
    "    if len(d_wh_u):\n",
    "        res_wh_u = model_wls_fe_cluster(d_wh_u, y=\"GAP_inbound_vs_within_WH\", x=\"GapPrice\",\n",
    "                                        weight_col=None, cluster_col=\"dest\")\n",
    "        print(res_wh_u.summary())\n",
    "        print_keycoef(res_wh_u, label=\"H2-WH (unweighted)\")\n",
    "        export_keycoef(res_wh_u, \"GapPrice\", BASE / \"H2_WH_keycoef_unweighted.csv\")\n",
    "        with open(BASE / \"H2_WH_full_summary_unweighted.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(str(res_wh_u.summary()))\n",
    "\n",
    "print(\"[OK] H2 estimation completed.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae964ca9-d02a-41de-b73f-44b8dd7b0ed5",
   "metadata": {},
   "outputs": [],
   "source": [
    "BASE = Path(r\"E:\\불평등 연구\\데이터\\59_출퇴근불평등\")\n",
    "CANDIDATES = [\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025_DESTinSeoul.parquet\",\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025_DESTinSeoul.csv\",\n",
    "    BASE / \"df_H2.xlsx\"                    # optional fallback\n",
    "]\n",
    "OUT_TXT = {\n",
    "    \"TOTAL\": BASE / \"H2_TOTAL_full_summary_weighted.txt\",\n",
    "    \"HW\":    BASE / \"H2_HW_full_summary_weighted.txt\",\n",
    "    \"WH\":    BASE / \"H2_WH_full_summary_weighted.txt\",\n",
    "}\n",
    "\n",
    "# =========================================\n",
    "# Load panel\n",
    "# =========================================\n",
    "def find_input_path():\n",
    "    for p in CANDIDATES:\n",
    "        if p.exists():\n",
    "            return p\n",
    "    raise FileNotFoundError(\"No input file found among:\\n\" + \"\\n\".join(map(str, CANDIDATES)))\n",
    "\n",
    "def read_dest_panel(path: Path) -> pd.DataFrame:\n",
    "    ext = path.suffix.lower()\n",
    "    if ext == \".parquet\":\n",
    "        df = pd.read_parquet(path)\n",
    "    elif ext == \".csv\":\n",
    "        df = pd.read_csv(path, encoding=\"utf-8-sig\")\n",
    "    elif ext in (\".xlsx\", \".xls\"):\n",
    "        try:\n",
    "            df = pd.read_excel(path, sheet_name=\"DEST_H2H3\")\n",
    "        except Exception:\n",
    "            df = pd.read_excel(path, sheet_name=0)\n",
    "    else:\n",
    "        raise ValueError(f\"Unsupported input extension: {ext}\")\n",
    "\n",
    "    df.columns = [str(c).strip() for c in df.columns]\n",
    "\n",
    "    req = [\"ym\", \"도착시군구코드\", \"GapPrice\",\n",
    "           \"GAP_inbound_vs_within_TOTAL\",\n",
    "           \"GAP_inbound_vs_within_HW\",\n",
    "           \"GAP_inbound_vs_within_WH\",\n",
    "           \"HW_flow_inbound\"]\n",
    "    missing = [c for c in req if c not in df.columns]\n",
    "    if missing:\n",
    "        raise KeyError(f\"Missing required columns: {missing}\")\n",
    "\n",
    "    # Clean keys\n",
    "    df[\"month\"] = df[\"ym\"].astype(str).str.replace(r\"\\D\", \"\", regex=True).str.slice(0, 6)\n",
    "    df[\"dest\"]  = df[\"도착시군구코드\"].astype(str).str.strip()\n",
    "\n",
    "    # Ensure numeric\n",
    "    for c in [\"GapPrice\", \"GAP_inbound_vs_within_TOTAL\",\n",
    "              \"GAP_inbound_vs_within_HW\", \"GAP_inbound_vs_within_WH\",\n",
    "              \"HW_flow_inbound\"]:\n",
    "        df[c] = pd.to_numeric(df[c], errors=\"coerce\")\n",
    "\n",
    "    return df\n",
    "\n",
    "# =========================================\n",
    "# Estimation (weighted WLS, FE, clustered SE)\n",
    "# =========================================\n",
    "def run_weighted_h2(df: pd.DataFrame, ycol: str, label: str):\n",
    "    # Build analysis sample\n",
    "    dat = df[[\"dest\", \"month\", \"GapPrice\", ycol, \"HW_flow_inbound\"]].copy()\n",
    "    dat = dat.dropna(subset=[\"GapPrice\", ycol, \"HW_flow_inbound\"])\n",
    "    dat = dat.loc[dat[\"HW_flow_inbound\"] > 0]\n",
    "\n",
    "    # WLS with FE: C(dest) + C(month), cluster at destination\n",
    "    formula = f\"{ycol} ~ GapPrice + C(dest) + C(month)\"\n",
    "    mdl = smf.wls(formula=formula, data=dat, weights=dat[\"HW_flow_inbound\"].astype(float).values)\n",
    "    res = mdl.fit(cov_type=\"cluster\", cov_kwds={\"groups\": dat[\"dest\"]})\n",
    "\n",
    "    # Print FULL summary and save to file\n",
    "    print(\"\\n\" + \"=\"*88)\n",
    "    print(f\"Full results: H2-{label} (weighted by inbound HW flow)\")\n",
    "    print(\"=\"*88)\n",
    "    print(res.summary())\n",
    "\n",
    "    with open(OUT_TXT[label], \"w\", encoding=\"utf-8\") as f:\n",
    "        f.write(str(res.summary()))\n",
    "    print(f\"[OK] wrote {OUT_TXT[label]}\")\n",
    "\n",
    "    return res\n",
    "\n",
    "# =========================================\n",
    "# Run all three weighted models (TOTAL, HW, WH)\n",
    "# =========================================\n",
    "if __name__ == \"__main__\":\n",
    "    in_path = find_input_path()\n",
    "    print(f\"[INFO] Using input: {in_path}\")\n",
    "    dest = read_dest_panel(in_path)\n",
    "\n",
    "    # TOTAL gap\n",
    "    res_total = run_weighted_h2(dest, \"GAP_inbound_vs_within_TOTAL\", \"TOTAL\")\n",
    "\n",
    "    # HW gap\n",
    "    res_hw = run_weighted_h2(dest, \"GAP_inbound_vs_within_HW\", \"HW\")\n",
    "\n",
    "    # WH gap\n",
    "    res_wh = run_weighted_h2(dest, \"GAP_inbound_vs_within_WH\", \"WH\")\n",
    "\n",
    "    print(\"\\n[DONE] H2 weighted models (TOTAL, HW, WH) completed.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c962fb0a-1412-4188-b192-fc36e1c5df6a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "2ca481a5-e3ef-450e-8626-2fba166497d1",
   "metadata": {},
   "source": [
    "## Analaysis 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eea838fe-fd2f-4489-8681-b92bd81a1b6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "BASE = Path(r\"E:\\불평등 연구\\데이터\\59_출퇴근불평등\")\n",
    "\n",
    "DEST_CANDIDATES = [\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025_DESTinSeoul.parquet\",\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025_DESTinSeoul.csv\",\n",
    "    BASE / \"df_H2.xlsx\"   \n",
    "]\n",
    "\n",
    "# Optional sources to reconstruct ell_dt if not already in DEST panel\n",
    "CODEBOOK_XLSX = BASE / \"서울생활이동데이터_자치구코드.xlsx\"   # maps code↔name\n",
    "APT_XLSX      = BASE / \"df_APT.xlsx\"                        # district names + monthly prices (wide)\n",
    "\n",
    "OUT_SUMTXT = BASE / \"H3_InboundShare_full_summary_weighted.txt\"\n",
    "OUT_KEYCSV = BASE / \"H3_InboundShare_keycoef_weighted.csv\"\n",
    "\n",
    "# =============================================================================\n",
    "# Utilities\n",
    "# =============================================================================\n",
    "def find_first_existing(paths):\n",
    "    for p in paths:\n",
    "        if p.exists():\n",
    "            return p\n",
    "    raise FileNotFoundError(\"No input file found among:\\n\" + \"\\n\".join(map(str, paths)))\n",
    "\n",
    "def read_dest_panel(path: Path) -> pd.DataFrame:\n",
    "    ext = path.suffix.lower()\n",
    "    if ext == \".parquet\":\n",
    "        df = pd.read_parquet(path)\n",
    "    elif ext == \".csv\":\n",
    "        df = pd.read_csv(path, encoding=\"utf-8-sig\")\n",
    "    elif ext in (\".xlsx\", \".xls\"):\n",
    "        # Try expected sheet name first\n",
    "        try:\n",
    "            df = pd.read_excel(path, sheet_name=\"DEST_H2H3\")\n",
    "        except Exception:\n",
    "            df = pd.read_excel(path, sheet_name=0)\n",
    "    else:\n",
    "        raise ValueError(f\"Unsupported input extension: {ext}\")\n",
    "\n",
    "    df.columns = [str(c).strip() for c in df.columns]\n",
    "\n",
    "    # Required keys\n",
    "    for c in [\"ym\", \"도착시군구코드\"]:\n",
    "        if c not in df.columns:\n",
    "            raise KeyError(f\"DEST panel missing required column: {c}\")\n",
    "\n",
    "    # Normalize keys\n",
    "    df[\"month\"] = (\n",
    "        df[\"ym\"].astype(str).str.replace(r\"\\D\", \"\", regex=True).str.slice(0, 6)\n",
    "    )\n",
    "    df[\"dest\"] = df[\"도착시군구코드\"].astype(str).str.strip()\n",
    "\n",
    "    # Standardize potential numeric columns if present\n",
    "    for c in [\"HW_flow_inbound\", \"HW_flow_total\",\n",
    "              \"InboundShare_HW\",\n",
    "              \"ell_dt\", \"ln_APT_price_d\", \"ln_price_d\", \"log_price_d\",\n",
    "              \"GapPrice\", \"ell_dt_origin_in\", \"ln_origin_in\"]:\n",
    "        if c in df.columns:\n",
    "            df[c] = pd.to_numeric(df[c], errors=\"coerce\")\n",
    "\n",
    "    return df\n",
    "\n",
    "def compute_inbound_share(dest_df: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Ensure an InboundShare_HW column exists:\n",
    "    InboundShare_HW = HW_flow_inbound / HW_flow_total\n",
    "    \"\"\"\n",
    "    if \"InboundShare_HW\" in dest_df.columns:\n",
    "        return dest_df\n",
    "\n",
    "    # Need both inbound and total HW flows\n",
    "    if (\"HW_flow_inbound\" in dest_df.columns) and (\"HW_flow_total\" in dest_df.columns):\n",
    "        tmp = dest_df.copy()\n",
    "        tmp[\"InboundShare_HW\"] = np.where(\n",
    "            tmp[\"HW_flow_total\"] > 0,\n",
    "            tmp[\"HW_flow_inbound\"] / tmp[\"HW_flow_total\"],\n",
    "            np.nan\n",
    "        )\n",
    "        return tmp\n",
    "\n",
    "    raise KeyError(\n",
    "        \"InboundShare_HW not found and cannot be constructed: \"\n",
    "        \"DEST panel needs 'HW_flow_inbound' and 'HW_flow_total'.\"\n",
    "    )\n",
    "\n",
    "def attach_ell_dt(dest_df: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Ensure an ell_dt column exists (log price of destination).\n",
    "    Priority:\n",
    "      1) Use an existing log price column (ell_dt / ln_APT_price_d / ln_price_d / log_price_d).\n",
    "      2) Reconstruct via GapPrice + inbound-weighted origin log price (if available).\n",
    "      3) Merge from APT_XLSX (district names + monthly nominal prices) via CODEBOOK_XLSX,\n",
    "         and set ell_dt = log(nominal price). Month FE absorbs any common deflator.\n",
    "    \"\"\"\n",
    "    # 1) Use existing\n",
    "    cand_logs = [\"ell_dt\", \"ln_APT_price_d\", \"ln_price_d\", \"log_price_d\"]\n",
    "    for c in cand_logs:\n",
    "        if c in dest_df.columns:\n",
    "            out = dest_df.copy()\n",
    "            out[\"ell_dt\"] = pd.to_numeric(out[c], errors=\"coerce\")\n",
    "            return out\n",
    "\n",
    "    # 2) Reconstruct from GapPrice + origin composite if present\n",
    "    origin_cands = [\"ell_dt_origin_in\", \"ln_origin_in\", \"ell_origin_in\", \"ln_origin\"]\n",
    "    if \"GapPrice\" in dest_df.columns:\n",
    "        for oc in origin_cands:\n",
    "            if oc in dest_df.columns:\n",
    "                out = dest_df.copy()\n",
    "                out[\"ell_dt\"] = pd.to_numeric(out[\"GapPrice\"], errors=\"coerce\") + \\\n",
    "                                pd.to_numeric(out[oc], errors=\"coerce\")\n",
    "                return out\n",
    "\n",
    "    # 3) Merge from APT_XLSX using codebook to translate names→codes\n",
    "    if not APT_XLSX.exists():\n",
    "        raise KeyError(\n",
    "            \"Cannot build ell_dt: no log price in DEST, no origin composite, \"\n",
    "            f\"and APT file not found at {APT_XLSX}.\"\n",
    "        )\n",
    "    if not CODEBOOK_XLSX.exists():\n",
    "        raise KeyError(\n",
    "            \"Cannot build ell_dt: no log price in DEST, no origin composite, \"\n",
    "            f\"and codebook not found at {CODEBOOK_XLSX}.\"\n",
    "        )\n",
    "\n",
    "    # Read codebook (expects columns like: '시군구' (code), 'full name' (korean name))\n",
    "    code_df = pd.read_excel(CODEBOOK_XLSX)\n",
    "    code_df.columns = [str(c).strip() for c in code_df.columns]\n",
    "    # pick columns robustly\n",
    "    if \"시군구\" not in code_df.columns or \"full name\" not in code_df.columns:\n",
    "        raise KeyError(\"Codebook must contain columns: '시군구' and 'full name'.\")\n",
    "    code_df[\"시군구\"] = code_df[\"시군구\"].astype(str).str.strip()\n",
    "    code_df[\"full name\"] = code_df[\"full name\"].astype(str).str.strip()\n",
    "\n",
    "    # Read APT (wide: one column with district names + many YYYYMM columns)\n",
    "    apt_wide = pd.read_excel(APT_XLSX)\n",
    "    apt_wide.columns = [str(c).strip() for c in apt_wide.columns]\n",
    "\n",
    "    # Locate district name column\n",
    "    name_col = None\n",
    "    for c in [\"district\", \"District\", \"구\", \"자치구\", \"full name\", \"Full name\"]:\n",
    "        if c in apt_wide.columns:\n",
    "            name_col = c\n",
    "            break\n",
    "    if name_col is None:\n",
    "        raise KeyError(\"APT file must contain a district-name column (e.g., 'district').\")\n",
    "\n",
    "    # Melt to long\n",
    "    value_vars = [c for c in apt_wide.columns if c != name_col]\n",
    "    apt_long = apt_wide.melt(\n",
    "        id_vars=[name_col], var_name=\"ym\", value_name=\"APT_price_dest\"\n",
    "    )\n",
    "    apt_long[\"ym\"] = apt_long[\"ym\"].astype(str).str.replace(r\"\\D\", \"\", regex=True).str.slice(0, 6)\n",
    "    apt_long[name_col] = apt_long[name_col].astype(str).strip()\n",
    "\n",
    "    # Map district name -> code via codebook's 'full name'\n",
    "    m = code_df.set_index(\"full name\")[\"시군구\"].to_dict()\n",
    "    apt_long[\"dest\"] = apt_long[name_col].map(m).astype(str)\n",
    "    apt_long = apt_long.dropna(subset=[\"dest\"])\n",
    "    apt_long[\"APT_price_dest\"] = pd.to_numeric(apt_long[\"APT_price_dest\"], errors=\"coerce\")\n",
    "\n",
    "    # log nominal price (month FE absorbs CPI)\n",
    "    apt_long[\"ell_dt\"] = np.log(apt_long[\"APT_price_dest\"])\n",
    "\n",
    "    # Merge onto DEST\n",
    "    out = dest_df.merge(\n",
    "        apt_long[[\"dest\", \"ym\", \"ell_dt\"]].rename(columns={\"ym\": \"month\"}),\n",
    "        on=[\"dest\", \"month\"], how=\"left\"\n",
    "    )\n",
    "    if out[\"ell_dt\"].isna().all():\n",
    "        raise KeyError(\"Failed to merge APT prices to DEST panel; check district names and months.\")\n",
    "    return out\n",
    "\n",
    "def run_h3_weighted(dest_df: pd.DataFrame):\n",
    "    \"\"\"\n",
    "    WLS with FE and destination-clustered SE:\n",
    "      InboundShare_HW ~ ell_dt + C(dest) + C(month)\n",
    "      weights = HW_flow_inbound\n",
    "    \"\"\"\n",
    "    # Ensure needed columns\n",
    "    need_cols = [\"dest\", \"month\", \"InboundShare_HW\", \"ell_dt\", \"HW_flow_inbound\"]\n",
    "    missing = [c for c in need_cols if c not in dest_df.columns]\n",
    "    if missing:\n",
    "        raise KeyError(f\"Missing columns for H3: {missing}\")\n",
    "\n",
    "    dat = dest_df[need_cols].copy()\n",
    "    # Clean numeric\n",
    "    for c in [\"InboundShare_HW\", \"ell_dt\", \"HW_flow_inbound\"]:\n",
    "        dat[c] = pd.to_numeric(dat[c], errors=\"coerce\")\n",
    "\n",
    "    # Analysis sample\n",
    "    dat = dat.dropna(subset=[\"InboundShare_HW\", \"ell_dt\", \"HW_flow_inbound\"])\n",
    "    dat = dat.loc[dat[\"HW_flow_inbound\"] > 0]\n",
    "\n",
    "    # Fit model\n",
    "    formula = \"InboundShare_HW ~ ell_dt + C(dest) + C(month)\"\n",
    "    mdl = smf.wls(formula=formula, data=dat, weights=dat[\"HW_flow_inbound\"].astype(float).values)\n",
    "    res = mdl.fit(cov_type=\"cluster\", cov_kwds={\"groups\": dat[\"dest\"]})\n",
    "\n",
    "    # Print FULL summary\n",
    "    print(\"\\n\" + \"=\"*88)\n",
    "    print(\"Full results: H3 — InboundShare_HW on log price (weighted by inbound HW flow)\")\n",
    "    print(\"=\"*88)\n",
    "    print(res.summary())\n",
    "\n",
    "    # Save outputs\n",
    "    with open(OUT_SUMTXT, \"w\", encoding=\"utf-8\") as f:\n",
    "        f.write(str(res.summary()))\n",
    "    key = pd.DataFrame({\n",
    "        \"param\": [\"ell_dt\"],\n",
    "        \"coef\":  [res.params.get(\"ell_dt\", np.nan)],\n",
    "        \"se\":    [res.bse.get(\"ell_dt\", np.nan)],\n",
    "        \"t\":     [res.tvalues.get(\"ell_dt\", np.nan)],\n",
    "        \"p\":     [res.pvalues.get(\"ell_dt\", np.nan)],\n",
    "        \"nobs\":  [res.nobs],\n",
    "        \"clusters_dest\": [dat[\"dest\"].nunique()]\n",
    "    })\n",
    "    key.to_csv(OUT_KEYCSV, index=False, encoding=\"utf-8-sig\")\n",
    "    print(f\"[OK] wrote {OUT_SUMTXT}\")\n",
    "    print(f\"[OK] wrote {OUT_KEYCSV}\")\n",
    "\n",
    "    # Quick interpretation\n",
    "    b = res.params.get(\"ell_dt\", np.nan)\n",
    "    if pd.notnull(b):\n",
    "        print(f\"\\nInterpretation: a 10% ↑ in destination price ⇒ {0.10*b:.4f} pp change in InboundShare (linear probability scale).\")\n",
    "\n",
    "    return res\n",
    "\n",
    "# =============================================================================\n",
    "# Run\n",
    "# =============================================================================\n",
    "if __name__ == \"__main__\":\n",
    "    in_path = find_first_existing(DEST_CANDIDATES)\n",
    "    print(f\"[INFO] Using DEST panel: {in_path}\")\n",
    "    dest0 = read_dest_panel(in_path)\n",
    "\n",
    "    # Ensure inbound share\n",
    "    dest1 = compute_inbound_share(dest0)\n",
    "\n",
    "    # Ensure ell_dt (log price at destination)\n",
    "    dest2 = attach_ell_dt(dest1)\n",
    "\n",
    "    # Estimate H3 (weighted main model)\n",
    "    _ = run_h3_weighted(dest2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d5ffe73-8ab1-42ab-aed3-38234bf38310",
   "metadata": {},
   "outputs": [],
   "source": [
    "BASE = Path(r\"E:\\불평등 연구\\데이터\\59_출퇴근불평등\")\n",
    "\n",
    "DEST_CANDIDATES = [\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025_DESTinSeoul.parquet\",\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025_DESTinSeoul.csv\",\n",
    "    BASE / \"df_H2.xlsx\",\n",
    "]\n",
    "\n",
    "# Optional sources to reconstruct ell_dt if not already in DEST panel\n",
    "CODEBOOK_XLSX = BASE / \"서울생활이동데이터_자치구코드.xlsx\"   # maps code↔name\n",
    "APT_XLSX      = BASE / \"df_APT.xlsx\"                        # district names + monthly prices (wide)\n",
    "\n",
    "# Output files\n",
    "OUT_SUMTXT_HW  = BASE / \"H3_InboundShare_HW_full_summary_weighted.txt\"\n",
    "OUT_KEYCSV_HW  = BASE / \"H3_InboundShare_HW_keycoef_weighted.csv\"\n",
    "OUT_SUMTXT_TOT = BASE / \"H3_InboundShare_TOT_full_summary_weighted.txt\"\n",
    "OUT_KEYCSV_TOT = BASE / \"H3_InboundShare_TOT_keycoef_weighted.csv\"\n",
    "\n",
    "# =============================================================================\n",
    "# Utilities\n",
    "# =============================================================================\n",
    "def find_first_existing(paths):\n",
    "    for p in paths:\n",
    "        if p.exists():\n",
    "            return p\n",
    "    raise FileNotFoundError(\"No input file found among:\\n\" + \"\\n\".join(map(str, paths)))\n",
    "\n",
    "def read_dest_panel(path: Path) -> pd.DataFrame:\n",
    "    ext = path.suffix.lower()\n",
    "    if ext == \".parquet\":\n",
    "        df = pd.read_parquet(path)\n",
    "    elif ext == \".csv\":\n",
    "        df = pd.read_csv(path, encoding=\"utf-8-sig\")\n",
    "    elif ext in (\".xlsx\", \".xls\"):\n",
    "        try:\n",
    "            df = pd.read_excel(path, sheet_name=\"DEST_H2H3\")\n",
    "        except Exception:\n",
    "            df = pd.read_excel(path, sheet_name=0)\n",
    "    else:\n",
    "        raise ValueError(f\"Unsupported input extension: {ext}\")\n",
    "\n",
    "    df.columns = [str(c).strip() for c in df.columns]\n",
    "\n",
    "    # Required keys\n",
    "    for c in [\"ym\", \"도착시군구코드\"]:\n",
    "        if c not in df.columns:\n",
    "            raise KeyError(f\"DEST panel missing required column: {c}\")\n",
    "\n",
    "    # Normalize keys\n",
    "    df[\"month\"] = (\n",
    "        df[\"ym\"].astype(str).str.replace(r\"\\D\", \"\", regex=True).str.slice(0, 6)\n",
    "    )\n",
    "    df[\"dest\"] = df[\"도착시군구코드\"].astype(str).str.strip()\n",
    "\n",
    "    # Coerce likely numeric\n",
    "    for c in [\n",
    "        \"HW_flow_inbound\",\"HW_flow_total\",\n",
    "        \"WH_flow_inbound\",\"WH_flow_total\",\n",
    "        \"TOT_flow_inbound\",\"TOT_flow_total\",\n",
    "        \"InboundShare_HW\",\"InboundShare_TOT\",\n",
    "        \"ell_dt\",\"ln_APT_price_d\",\"ln_price_d\",\"log_price_d\",\n",
    "        \"GapPrice\",\"ell_dt_origin_in\",\"ln_origin_in\"\n",
    "    ]:\n",
    "        if c in df.columns:\n",
    "            df[c] = pd.to_numeric(df[c], errors=\"coerce\")\n",
    "\n",
    "    return df\n",
    "\n",
    "def compute_inbound_share_HW(dest_df: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"Ensure InboundShare_HW exists.\"\"\"\n",
    "    if \"InboundShare_HW\" in dest_df.columns:\n",
    "        return dest_df\n",
    "    if (\"HW_flow_inbound\" in dest_df.columns) and (\"HW_flow_total\" in dest_df.columns):\n",
    "        out = dest_df.copy()\n",
    "        out[\"InboundShare_HW\"] = np.where(\n",
    "            out[\"HW_flow_total\"] > 0,\n",
    "            out[\"HW_flow_inbound\"] / out[\"HW_flow_total\"],\n",
    "            np.nan,\n",
    "        )\n",
    "        return out\n",
    "    raise KeyError(\"Cannot construct InboundShare_HW: need 'HW_flow_inbound' and 'HW_flow_total'.\")\n",
    "\n",
    "def compute_inbound_share_TOT(dest_df: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Define TOTAL-flow inbound share if possible.\n",
    "    Priority:\n",
    "      (a) use existing TOT_flow_* columns\n",
    "      (b) else if WH_flow_* also present, set TOT = HW + WH\n",
    "      (c) else fall back to HW share and HW weights with a note\n",
    "    \"\"\"\n",
    "    out = dest_df.copy()\n",
    "\n",
    "    # (a) already present\n",
    "    if (\"TOT_flow_inbound\" in out.columns) and (\"TOT_flow_total\" in out.columns):\n",
    "        pass\n",
    "    # (b) construct from HW + WH if available\n",
    "    elif (\n",
    "        (\"HW_flow_inbound\" in out.columns) and (\"HW_flow_total\" in out.columns) and\n",
    "        (\"WH_flow_inbound\" in out.columns) and (\"WH_flow_total\" in out.columns)\n",
    "    ):\n",
    "        out[\"TOT_flow_inbound\"] = (out[\"HW_flow_inbound\"].fillna(0) +\n",
    "                                   out[\"WH_flow_inbound\"].fillna(0))\n",
    "        out[\"TOT_flow_total\"]   = (out[\"HW_flow_total\"].fillna(0)   +\n",
    "                                   out[\"WH_flow_total\"].fillna(0))\n",
    "    else:\n",
    "        # (c) graceful fallback\n",
    "        if \"InboundShare_HW\" not in out.columns:\n",
    "            out = compute_inbound_share_HW(out)\n",
    "        out[\"InboundShare_TOT\"]  = out[\"InboundShare_HW\"]\n",
    "        # if weights missing, reuse HW inbound as analytic weights\n",
    "        if \"TOT_flow_inbound\" not in out.columns:\n",
    "            out[\"TOT_flow_inbound\"] = out.get(\"HW_flow_inbound\", np.nan)\n",
    "        if \"TOT_flow_total\" not in out.columns:\n",
    "            out[\"TOT_flow_total\"] = out.get(\"HW_flow_total\", np.nan)\n",
    "        print(\"[INFO] DEST lacks total-flow counts; using InboundShare_TOT ≡ InboundShare_HW and HW inbound weights.\")\n",
    "        return out\n",
    "\n",
    "    # compute share from TOT flows\n",
    "    out[\"InboundShare_TOT\"] = np.where(\n",
    "        out[\"TOT_flow_total\"] > 0,\n",
    "        out[\"TOT_flow_inbound\"] / out[\"TOT_flow_total\"],\n",
    "        np.nan,\n",
    "    )\n",
    "    return out\n",
    "\n",
    "def attach_ell_dt(dest_df: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Ensure an ell_dt column exists (log price at destination).\n",
    "      1) prefer existing log-price columns\n",
    "      2) reconstruct from GapPrice + origin composite if available\n",
    "      3) merge from df_APT.xlsx via codebook and take log(nominal price)\n",
    "    \"\"\"\n",
    "    # 1) Use existing\n",
    "    for c in [\"ell_dt\",\"ln_APT_price_d\",\"ln_price_d\",\"log_price_d\"]:\n",
    "        if c in dest_df.columns:\n",
    "            out = dest_df.copy()\n",
    "            out[\"ell_dt\"] = pd.to_numeric(out[c], errors=\"coerce\")\n",
    "            return out\n",
    "\n",
    "    # 2) Reconstruct\n",
    "    origin_cands = [\"ell_dt_origin_in\",\"ln_origin_in\",\"ell_origin_in\",\"ln_origin\"]\n",
    "    if \"GapPrice\" in dest_df.columns:\n",
    "        for oc in origin_cands:\n",
    "            if oc in dest_df.columns:\n",
    "                out = dest_df.copy()\n",
    "                out[\"ell_dt\"] = pd.to_numeric(out[\"GapPrice\"], errors=\"coerce\") + \\\n",
    "                                pd.to_numeric(out[oc], errors=\"coerce\")\n",
    "                return out\n",
    "\n",
    "    # 3) Merge from APT (wide) using codebook names\n",
    "    if not APT_XLSX.exists():\n",
    "        raise KeyError(\"ell_dt missing and APT file not found at {APT_XLSX}.\")\n",
    "    if not CODEBOOK_XLSX.exists():\n",
    "        raise KeyError(\"ell_dt missing and codebook not found at {CODEBOOK_XLSX}.\")\n",
    "\n",
    "    code_df = pd.read_excel(CODEBOOK_XLSX)\n",
    "    code_df.columns = [str(c).strip() for c in code_df.columns]\n",
    "    if \"시군구\" not in code_df.columns or \"full name\" not in code_df.columns:\n",
    "        raise KeyError(\"Codebook must have '시군구' and 'full name'.\")\n",
    "    code_df[\"시군구\"] = code_df[\"시군구\"].astype(str).str.strip()\n",
    "    code_df[\"full name\"] = code_df[\"full name\"].astype(str).str.strip()\n",
    "\n",
    "    apt_wide = pd.read_excel(APT_XLSX)\n",
    "    apt_wide.columns = [str(c).strip() for c in apt_wide.columns]\n",
    "\n",
    "    name_col = None\n",
    "    for c in [\"district\",\"District\",\"구\",\"자치구\",\"full name\",\"Full name\"]:\n",
    "        if c in apt_wide.columns:\n",
    "            name_col = c\n",
    "            break\n",
    "    if name_col is None:\n",
    "        raise KeyError(\"APT file must contain a district-name column (e.g., 'district').\")\n",
    "\n",
    "    value_vars = [c for c in apt_wide.columns if c != name_col]\n",
    "    apt_long = apt_wide.melt(id_vars=[name_col], var_name=\"ym\", value_name=\"APT_price_dest\")\n",
    "    apt_long[\"ym\"] = apt_long[\"ym\"].astype(str).str.replace(r\"\\D\", \"\", regex=True).str.slice(0, 6)\n",
    "    apt_long[name_col] = apt_long[name_col].astype(str).str.strip()\n",
    "\n",
    "    name_to_code = code_df.set_index(\"full name\")[\"시군구\"].to_dict()\n",
    "    apt_long[\"dest\"] = apt_long[name_col].map(name_to_code).astype(str)\n",
    "    apt_long = apt_long.dropna(subset=[\"dest\"])\n",
    "    apt_long[\"APT_price_dest\"] = pd.to_numeric(apt_long[\"APT_price_dest\"], errors=\"coerce\")\n",
    "    apt_long[\"ell_dt\"] = np.log(apt_long[\"APT_price_dest\"])\n",
    "\n",
    "    out = dest_df.merge(\n",
    "        apt_long[[\"dest\",\"ym\",\"ell_dt\"]].rename(columns={\"ym\":\"month\"}),\n",
    "        on=[\"dest\",\"month\"], how=\"left\"\n",
    "    )\n",
    "    if out[\"ell_dt\"].isna().all():\n",
    "        raise KeyError(\"Failed to attach ell_dt from APT file; check names/months.\")\n",
    "    return out\n",
    "\n",
    "def run_h3_weighted(dest_df: pd.DataFrame,\n",
    "                    dep_col: str,\n",
    "                    weight_col: str,\n",
    "                    sum_path: Path,\n",
    "                    key_path: Path,\n",
    "                    title: str):\n",
    "    \"\"\"\n",
    "    WLS with FE and destination-clustered SE:\n",
    "        dep_col ~ ell_dt + C(dest) + C(month)\n",
    "        weights = weight_col\n",
    "    \"\"\"\n",
    "    need = [\"dest\",\"month\",dep_col,\"ell_dt\",weight_col]\n",
    "    miss = [c for c in need if c not in dest_df.columns]\n",
    "    if miss:\n",
    "        raise KeyError(f\"Missing columns for H3 ({title}): {miss}\")\n",
    "\n",
    "    dat = dest_df[need].copy()\n",
    "    for c in [dep_col,\"ell_dt\",weight_col]:\n",
    "        dat[c] = pd.to_numeric(dat[c], errors=\"coerce\")\n",
    "\n",
    "    dat = dat.dropna(subset=[dep_col,\"ell_dt\",weight_col])\n",
    "    dat = dat.loc[dat[weight_col] > 0]\n",
    "\n",
    "    formula = f\"{dep_col} ~ ell_dt + C(dest) + C(month)\"\n",
    "    mdl = smf.wls(formula=formula, data=dat, weights=dat[weight_col].astype(float).values)\n",
    "    res = mdl.fit(cov_type=\"cluster\", cov_kwds={\"groups\": dat[\"dest\"]})\n",
    "\n",
    "    print(\"\\n\" + \"=\"*92)\n",
    "    print(f\"Full results: H3 — {title}\")\n",
    "    print(\"=\"*92)\n",
    "    print(res.summary())\n",
    "\n",
    "    with open(sum_path, \"w\", encoding=\"utf-8\") as f:\n",
    "        f.write(str(res.summary()))\n",
    "    key = pd.DataFrame({\n",
    "        \"param\": [\"ell_dt\"],\n",
    "        \"coef\":  [res.params.get(\"ell_dt\", np.nan)],\n",
    "        \"se\":    [res.bse.get(\"ell_dt\", np.nan)],\n",
    "        \"t\":     [res.tvalues.get(\"ell_dt\", np.nan)],\n",
    "        \"p\":     [res.pvalues.get(\"ell_dt\", np.nan)],\n",
    "        \"nobs\":  [res.nobs],\n",
    "        \"clusters_dest\": [dat[\"dest\"].nunique()]\n",
    "    })\n",
    "    key.to_csv(key_path, index=False, encoding=\"utf-8-sig\")\n",
    "    print(f\"[OK] wrote {sum_path}\")\n",
    "    print(f\"[OK] wrote {key_path}\")\n",
    "\n",
    "    b = res.params.get(\"ell_dt\", np.nan)\n",
    "    if pd.notnull(b):\n",
    "        print(f\"\\nInterpretation: 10% ↑ in destination price ⇒ {0.10*b:.4f} pp change in {dep_col} (linear prob scale).\")\n",
    "\n",
    "    return res\n",
    "\n",
    "# =============================================================================\n",
    "# Run\n",
    "# =============================================================================\n",
    "if __name__ == \"__main__\":\n",
    "    in_path = find_first_existing(DEST_CANDIDATES)\n",
    "    print(f\"[INFO] Using DEST panel: {in_path}\")\n",
    "    dest0 = read_dest_panel(in_path)\n",
    "\n",
    "    # Ensure shares\n",
    "    dest1 = compute_inbound_share_HW(dest0)          # InboundShare_HW\n",
    "    dest2 = compute_inbound_share_TOT(dest1)         # InboundShare_TOT (or fallback)\n",
    "\n",
    "    # Ensure destination log price\n",
    "    dest3 = attach_ell_dt(dest2)\n",
    "\n",
    "    # --- H3 (HW share, HW inbound weights) ---\n",
    "    _ = run_h3_weighted(\n",
    "        dest_df = dest3,\n",
    "        dep_col = \"InboundShare_HW\",\n",
    "        weight_col = \"HW_flow_inbound\",\n",
    "        sum_path = OUT_SUMTXT_HW,\n",
    "        key_path = OUT_KEYCSV_HW,\n",
    "        title   = \"InboundShare_HW on log price (weights = inbound HW flow)\"\n",
    "    )\n",
    "\n",
    "    # --- H3 (TOTAL share, TOTAL inbound weights if available) ---\n",
    "    wcol = \"TOT_flow_inbound\" if \"TOT_flow_inbound\" in dest3.columns else \"HW_flow_inbound\"\n",
    "    _ = run_h3_weighted(\n",
    "        dest_df = dest3,\n",
    "        dep_col = \"InboundShare_TOT\",\n",
    "        weight_col = wcol,\n",
    "        sum_path = OUT_SUMTXT_TOT,\n",
    "        key_path = OUT_KEYCSV_TOT,\n",
    "        title   = f\"InboundShare_TOT on log price (weights = {'inbound TOTAL flow' if wcol=='TOT_flow_inbound' else 'inbound HW flow (fallback)'})\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b63280f-c978-4fa5-b1ff-57f47b62e763",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0dccf6c-e477-436a-99dc-738e1213423a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b86b8e5-991e-46a7-8c84-d82a46dbaa49",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3221db25-7e4c-453f-9906-d865ddf3b9ad",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "991c172c-6448-47a9-8fd1-4ca217e53f63",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af36650f-d053-4dd3-9965-528f0754b831",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc7fefbf-0715-4f7f-8363-de29f002d484",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21adb620-ab03-4d4e-80c9-4959308fd5c4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
